Load packages

library(ggplot2)
library(Rtsne)
library(phyloseq)
library(ape)
library(gridExtra)
library(svglite) # Requires Cairo. Run "brew install cairo" if on a mac
library(pheatmap)
library(RColorBrewer)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter,
##     Find, get, grep, grepl, intersect, is.unsorted, lapply, Map,
##     mapply, match, mget, order, paste, pmax, pmax.int, pmin,
##     pmin.int, Position, rank, rbind, Reduce, rownames, sapply,
##     setdiff, sort, table, tapply, union, unique, unsplit, which,
##     which.max, which.min
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## The following object is masked from 'package:phyloseq':
## 
##     distance
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:phyloseq':
## 
##     sampleNames
## Loading required package: DelayedArray
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians
## The following object is masked from 'package:dplyr':
## 
##     count
## Loading required package: BiocParallel
## 
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
## 
##     colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following objects are masked from 'package:base':
## 
##     aperm, apply, rowsum
library(reshape2)
library(tibble)

#library(devtools)
#install_github("opisthokonta/tsnemicrobiota")
library(tsnemicrobiota)

#BiocManager::install("GO.db")
#BiocManager::install("impute")
#BiocManager::install("preprocessCore")
#install_github("umerijaz/microbiomeSeq") 
library(microbiomeSeq)
## Registered S3 methods overwritten by 'adegraphics':
##   method         from
##   biplot.dudi    ade4
##   kplot.foucart  ade4
##   kplot.mcoa     ade4
##   kplot.mfa      ade4
##   kplot.pta      ade4
##   kplot.sepan    ade4
##   kplot.statis   ade4
##   scatter.coa    ade4
##   scatter.dudi   ade4
##   scatter.nipals ade4
##   scatter.pco    ade4
##   score.acm      ade4
##   score.mix      ade4
##   score.pca      ade4
##   screeplot.dudi ade4
## Registered S3 method overwritten by 'spdep':
##   method   from
##   plot.mst ape
## Registered S3 methods overwritten by 'adespatial':
##   method             from       
##   plot.multispati    adegraphics
##   print.multispati   ade4       
##   summary.multispati ade4
library(igraph)
## 
## Attaching package: 'igraph'
## The following object is masked from 'package:tibble':
## 
##     as_data_frame
## The following objects are masked from 'package:DelayedArray':
## 
##     path, simplify
## The following object is masked from 'package:GenomicRanges':
## 
##     union
## The following object is masked from 'package:IRanges':
## 
##     union
## The following object is masked from 'package:S4Vectors':
## 
##     union
## The following objects are masked from 'package:BiocGenerics':
## 
##     normalize, path, union
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:ape':
## 
##     edges, mst, ring
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
library(visNetwork)

library(ComplexHeatmap)
## Loading required package: grid
## ========================================
## ComplexHeatmap version 2.0.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
## 
## If you use it in published research, please cite:
## Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
##   genomic data. Bioinformatics 2016.
## ========================================
library(circlize)
## ========================================
## circlize version 0.4.8
## CRAN page: https://cran.r-project.org/package=circlize
## Github page: https://github.com/jokergoo/circlize
## Documentation: http://jokergoo.github.io/circlize_book/book/
## 
## If you use it in published research, please cite:
## Gu, Z. circlize implements and enhances circular visualization 
##   in R. Bioinformatics 2014.
## ========================================
## 
## Attaching package: 'circlize'
## The following object is masked from 'package:igraph':
## 
##     degree
library(ggsci)

library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:Biobase':
## 
##     combine
## The following object is masked from 'package:BiocGenerics':
## 
##     combine
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:IRanges':
## 
##     cov, var
## The following objects are masked from 'package:S4Vectors':
## 
##     cov, var
## The following object is masked from 'package:BiocGenerics':
## 
##     var
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

Load data

# Phyloseq object (Both abundance and taxonomic info)
physeq = readRDS("/Users/angelol/Documents/PhD/Gut-microbiome-immunotherapy/data/Abundance_tables/physeq.rds")

# Abundance table for each OTU
OTU = readRDS("/Users/angelol/Documents/PhD/Gut-microbiome-immunotherapy/data/Abundance_tables/OTU.rds")
# Clinical metadata
clin = readRDS("/Users/angelol/Documents/PhD/Gut-microbiome-immunotherapy/Metadata/Processed_metadata/clin.rds")
# Pyholgenetic metadata
phylo_meta = readRDS("/Users/angelol/Documents/PhD/Gut-microbiome-immunotherapy/Metadata/Processed_metadata/phylo_meta.rds")

Pre-process data

Let’s filter out microbes that are not identified across at least 5 samples and convert to relative abundance

physeq_f = filter_taxa(physeq, function(x) sum(x > 0) > 5, TRUE)
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'RNeXML'
physeq_f = transform_sample_counts(physeq_f, function(x) x/sum(x))
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'RNeXML'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'RNeXML'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'RNeXML'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'RNeXML'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'RNeXML'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'RNeXML'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'RNeXML'
OTU = apply(OTU, 2, function(x){x/sum(x)})
# Filter out microbes that are not identified across at least 5 samples
OTU_filtered = OTU[apply(OTU, 1, function(x) sum(x > 0)) > 5,]
# Sort according to most abundant OTUs
OTU_filtered = OTU_filtered[order(rowSums(OTU_filtered),decreasing = TRUE),]

##Explore data

Let’s first visualize differences in microbial composition across patients using PCoA. We will use weighted UNIFRAC as our distance matrix.

# Calculate weighted UniFrac distances between all filtered samples
distance.matrix = UniFrac(physeq_f,weighted = TRUE)
# Perform PCoA
PCoA = cmdscale(distance.matrix, eig = TRUE, x.ret = TRUE)
# Extract variance information for each PCoA axis
PCoA.variance = round(PCoA$eig/sum(PCoA$eig)*100, 1)
# Extract values for each component
PCoA.values = PCoA$points
# Format data for ggplot
PCoA.data = data.frame(Sample = rownames(PCoA.values),
                       X = PCoA.values[,1],
                       Y = PCoA.values[,2],
                       Response = gsub(".*_R","R",gsub(".*_NR","NR",rownames(PCoA.values))),
                       Study = gsub(".*_Rou_.*","Routy",(gsub(".*_Gop_.*","Gopalakrishnan",gsub(".*_Mat_.*","Matson",gsub(".*_Fra_.*","Frankel",rownames(PCoA.values)))))))

# Call ggplot
p_MDS = ggplot(data = PCoA.data, aes(x = X, y = Y)) +
  geom_point(size = 2, aes(color = Response, shape = Study)) +
  stat_ellipse(aes(color = Response, group = Response),size = 1, linetype = 1) +
  stat_ellipse(aes(group = Study), linetype = 3, color = "grey70") +
  xlab(paste("MDS1: ", PCoA.variance[1],"%", sep = "")) +
  ylab(paste("MDS2: ", PCoA.variance[2],"%", sep = "")) +
  ggtitle("All samples") +
  theme_classic() +
  theme(plot.title = element_text(face="bold")) +
  theme(text = element_text(family = "Helvetica")) +
  theme(axis.text.x=element_blank(),
        axis.text.y=element_blank()) +
  theme(aspect.ratio=1) +
  scale_color_manual(values=c("violetred", "darkturquoise"))
p_MDS

# Perform the analysis for all studies separately
studies = c("Gopalakrishnan","Matson","Frankel","Routy")
# Initialize empty plot list
p_list = list()
for (i in 1:length(studies)) {
  local({ #Sets local enviroment in order to not overwrite plot data
    
    # Extract three letter study ID
    study_id = substr(studies[i], start = 1, stop = 3)
    # Subset OTU_filtered matrix
    physeq_f_study = prune_samples(samples = grepl(study_id,sample_names(physeq_f)), physeq_f)
    # Calculate weighted UniFrac distances between all filtered samples
    distance.matrix = UniFrac(physeq_f_study,weighted = TRUE)
    # Perform PCoA
    PCoA = cmdscale(distance.matrix, eig = TRUE, x.ret = TRUE)
    # Extract variance information for each PCoA axis
    PCoA.variance = round(PCoA$eig/sum(PCoA$eig)*100, 1)
    # Extract values for each component
    PCoA.values = PCoA$points
    # Format data for ggplot
    PCoA.data = data.frame(Sample = rownames(PCoA.values),
                           X = PCoA.values[,1],
                           Y = PCoA.values[,2],
                           Response = gsub(".*_R","R",gsub(".*_NR","NR",rownames(PCoA.values))))
    # Call ggplot
    p_MDS_study = ggplot(data = PCoA.data, aes(x = X, y = Y)) +
      geom_point(size = 2, aes(color = Response), show.legend = FALSE) +
      stat_ellipse(aes(X,Y,color=Response, group=Response), show.legend = FALSE) +
      xlab(paste("MDS1: ", PCoA.variance[1],"%", sep = "")) +
      ylab(paste("MDS2: ", PCoA.variance[2],"%", sep = "")) +
      ggtitle(paste(studies[i]," et al.", sep = "")) +
      theme_classic() +
      theme(plot.title = element_text(face="bold")) +
      theme(text = element_text(family = "Helvetica")) +
      theme(axis.text.x=element_blank(),
            axis.text.y=element_blank()) +
      theme(aspect.ratio=1) +
      scale_color_manual(values=c("violetred", "darkturquoise"))
    # Save to plot list
    p_list[[i]] <<- p_MDS_study
  })
}

grid.arrange(
  grobs = p_list,
  layout_matrix = rbind(c(1, 2),
                        c(3, 4))
)

Let’s also try manifold learning using t-SNE with weighted UniFrac.

#Perform tSNE on filtered samples
set.seed(9)
tsne_res = tsne_phyloseq(physeq_f, distance='wunifrac',
                     perplexity = 20, verbose=0, rng_seed = 9)

tsne.data = data.frame(Sample = sample_names(physeq_f),
                       X = tsne_res$tsne$par[,1],
                       Y = tsne_res$tsne$par[,2],
                       Response = gsub(".*_R","R",gsub(".*_NR","NR",sample_names(physeq_f))),
                       Study = gsub(".*_Rou_.*","Routy",(gsub(".*_Gop_.*","Gopalakrishnan",gsub(".*_Mat_.*","Matson",gsub(".*_Fra_.*","Frankel",rownames(PCoA.values)))))))
# Call ggplot
p_tsne <- ggplot(tsne.data, aes(x=X, y=Y)) +
  geom_point(size = 2, aes(color = Response, shape = Study)) +
  stat_ellipse(aes(color = Response, group = Response),size = 1, linetype = 1) +
  stat_ellipse(aes(group = Study), linetype = 3, color = "grey70") +
  xlab("t-SNE 1") +
  ylab("t-SNE 2") +
  ggtitle("All samples") +
  theme_classic() +
  theme(plot.title = element_text(face="bold")) +
  theme(text = element_text(family = "Helvetica")) +
  theme(axis.text.x=element_blank(),
        axis.text.y=element_blank()) +
  theme(aspect.ratio=1) +
  scale_color_manual(values=c("violetred", "darkturquoise"))
p_tsne

# Perform the analysis for all studies separately
studies = c("Gopalakrishnan","Matson","Frankel","Routy")
# Initialize empty plot list
p_list_tsne = list()
for (i in 1:length(studies)) {
  local({ #Sets local enviroment in order to not overwrite plot data
    
    # Extract three letter study ID
    study_id = substr(studies[i], start = 1, stop = 3)
    # Subset OTU_filtered matrix
    physeq_f_study = prune_samples(samples = grepl(study_id,sample_names(physeq_f)), physeq_f)
    # Perform tSNE
    tsne_res = tsne_phyloseq(physeq_f_study, distance='wunifrac',
                             perplexity = 20, verbose=0, rng_seed = 9)
    # Format data for ggplot
    tsne.data = data.frame(Sample = sample_names(physeq_f_study),
                           X = tsne_res$tsne$par[,1],
                           Y = tsne_res$tsne$par[,2],
                           Response = gsub(".*_R","R",gsub(".*_NR","NR",sample_names(physeq_f_study))))
    # Call ggplot
    p_tsne_study <- ggplot(tsne.data, aes(x=X, y=Y)) +
      geom_point(size = 2, aes(color = Response)) +
      stat_ellipse(aes(color = Response, group = Response),size = 1, linetype = 1) +
      xlab("t-SNE 1") +
      ylab("t-SNE 2") +
      ggtitle(paste(studies[i]," et al.", sep = "")) +
      theme_classic() +
      theme(plot.title = element_text(face="bold")) +
      theme(text = element_text(family = "Helvetica")) +
      theme(axis.text.x=element_blank(),
            axis.text.y=element_blank()) +
      theme(aspect.ratio=1) +
      scale_color_manual(values=c("violetred", "darkturquoise"))
    # Save to plot list
    p_list_tsne[[i]] <<- p_tsne_study
  })
}

grid.arrange(
  grobs = p_list_tsne,
  layout_matrix = rbind(c(1, 2),
                        c(3, 4))
)

Alpha diversity

plot_anova_diversity(physeq, method = c("shannon", "richness","simpson","fisher"), grouping_column = "Response", pValueCutoff = 0.05)

plot_anova_diversity(physeq, method = c("shannon", "richness","simpson","fisher"), grouping_column = "Study", pValueCutoff = 0.05)

Let’s also check differences for each study

studies = c("Gopalakrishnan","Matson","Frankel","Routy")

# Define diversity indeces to be used
# - Shannon: Estimator of species richness and species evenness, more weight on species richness.
# - InvSimpson: Estimator of species richness and species evenness, more weight on species evenness.
# - ACE: Abundance-based Coverage Estimator of species richness.
# - Chao1: Abundance-based estimator of species richness.

div_index = c("Shannon","InvSimpson","ACE","Chao1")
# Create empty matrix for storing pvalues for significance testing between R and NR
pvalue_mat = data.frame(matrix(data = NA,nrow = length(studies), ncol = length(div_index)))
rownames(pvalue_mat) = studies
colnames(pvalue_mat) = div_index
p_list = list()
for (i in 1:length(studies)) {
  local({ #Sets local enviroment in order to not overwrite plot data
    # Extract three letter study ID
    study_id = substr(studies[i], start = 1, stop = 3)
    # Subset OTU_filtered matrix
    physeq_f_study = prune_samples(samples = grepl(study_id,sample_names(physeq)), physeq)
    for (j in 1:length(div_index)) {
      #Calculate alpha diversity
      alpha_div_study = estimate_richness(physeq_f_study, measures = div_index[j]) 
      # Extract diversities for responders and non responders
      R_div_study = as.numeric(alpha_div_study[grepl("_R",rownames(alpha_div_study)),1])
      NR_div_study = as.numeric(alpha_div_study[grepl("_NR",rownames(alpha_div_study)),1])
      # Perform wilcoxon rank sum test
      pvalue_mat[i,j] <<- wilcox.test(R_div_study, NR_div_study, alternative = "two.sided")$p.value
    }
    # Create y-axis label for grid.arrange
    if(i==1){label = "Index value"}
    else{label = ""}
    # Plot alpha diversity (Shannon only)
    p_study = plot_richness(physeq_f_study,x = "Response", measures = "Shannon", color = "Response") +
      #geom_boxplot() +
      theme(legend.position = "none") +
      ylab(label) +
      xlab("")
    # Save to plot list
    p_list[[i]] <<- p_study
  })
}

p_all = grid.arrange(grobs = p_list,
                     layout_matrix = rbind(c(1, 2, 3, 4),
                                           c(1, 2, 3, 4))
)

Distributions of taxa

Let’s barplots in order to visualise differences in taxonomic distributions across responders vs non-responders.

taxlevel = c("Phylum","Order","Family","Genus")

for (i in 1:length(taxlevel)) {
  physeq_f_level = taxa_level(physeq_f, which_level = taxlevel[i])
  p_bar = plot_taxa(physeq_f_level, grouping_column = "Response", method = "hellinger", number.taxa = ifelse(i == 1,15,20), filename = NULL)

  # Extract the most present microbe in the data, used for sorting barplots.
  top_microbe = names(which.max(apply(otu_table(physeq_f_level), 2, sum)))
  # Order samples according to top microbe
  top_samples = otu_table(physeq_f_level)[,top_microbe]
  top_samples = top_samples[order(top_samples, decreasing = TRUE)]
  top_samples = rownames(top_samples)
  # Re-arrange the factors
  p_bar$data$Sample = factor(p_bar$data$Sample, levels = top_samples)
  
  p_bar = p_bar + 
    labs(title = paste0("Distribution of taxa at the ", tolower(taxlevel[i]), " level"),
         subtitle = paste0("Grouped by response")) +
    xlab("Samples") +
    ylab("Relative abundance") +
    theme(
    strip.background = element_blank(),
    axis.text = element_blank(),
    plot.title = element_text(face="bold"),
    text = element_text(family = "Helvetica"))
  
  print(p_bar)
}

Let’s also group samples by study.

taxlevel = c("Phylum","Order","Family","Genus")

for (i in 1:length(taxlevel)) {
  physeq_f_level = taxa_level(physeq_f, which_level = taxlevel[i])
  
  p_bar = plot_taxa(physeq_f_level, grouping_column = "Study", method = "hellinger", number.taxa = ifelse(i == 1,15,20), filename = NULL)
  
  # Extract the most present microbe in the data, used for sorting barplots.
  top_microbe = names(which.max(apply(otu_table(physeq_f_level), 2, sum)))
  # Order samples according to top microbe
  top_samples = otu_table(physeq_f_level)[,top_microbe]
  top_samples = top_samples[order(top_samples, decreasing = TRUE)]
  top_samples = rownames(top_samples)
  # Re-arrange the factors
  p_bar$data$Sample = factor(p_bar$data$Sample, levels = top_samples)
  
  p_bar = p_bar + 
    labs(title = paste0("Distribution of taxa at the ", tolower(taxlevel[i]), " level"),
         subtitle = paste0("Grouped by study")) +
    xlab("Samples") +
    ylab("Relative abundance") +
    theme_bw() +
    theme(strip.background = element_blank(),
          axis.text = element_blank(),
          plot.title = element_text(face="bold"),
          text = element_text(family = "Helvetica")) +
    scale_color_grey()
  
  print(p_bar)
}

Co-occurence analysis

# Select taxonomic level
physeq_level = taxa_level(physeq, which_level = "Genus")


# Enriched in responders
co_occr = co_occurence_network(physeq_level, grouping_column = "Response", rhos = 0.3, 
    method = "cor", qval_threshold = 0.05, select.condition = "R", scale.vertex.size = 4, 
    scale.edge.width = 15, plotNetwork = T, plotBetweennessEeigenvalue = F)
## 
g <- co_occr$net$graph
data <- toVisNetworkData(g)
visNetwork(nodes = data$nodes,
           edges = data$edges,
           width = 900,
           height = 900) %>% visOptions(highlightNearest = TRUE, nodesIdSelection = TRUE)
# Enriched in non-responders
co_occr = co_occurence_network(physeq_level, grouping_column = "Response", rhos = 0.3, 
    method = "cor", qval_threshold = 0.05, select.condition = "NR", scale.vertex.size = 4, 
    scale.edge.width = 15, plotNetwork = T, plotBetweennessEeigenvalue = F)
g <- co_occr$net$graph
data <- toVisNetworkData(g)
visNetwork(nodes = data$nodes,
           edges = data$edges,
           width = 900,
           height = 900) %>% visOptions(highlightNearest = TRUE, nodesIdSelection = TRUE)

Differential abundance

Select taxonomic level:

physeq_f_level = taxa_level(physeq_f, which_level = "Species")

OTU_filtered = as.matrix(t(as.data.frame(otu_table(physeq_f_level))))
# Initialise empty data frame for storing differentially abundant OTUs
diffOTUs = data.frame(pvalue = 1:nrow(OTU_filtered), fold_change = 1:nrow(OTU_filtered), taxonomy = phylo_meta[rownames(OTU_filtered),1])
rownames(diffOTUs) = rownames(OTU_filtered)

# Loop over all OTUs
for (i in 1:nrow(OTU_filtered)) {
  # Extract abundances of each OTU for both R and NR
  R_abundance = as.numeric(OTU_filtered[i,grepl("_R",colnames(OTU_filtered))])
  NR_abundance = as.numeric(OTU_filtered[i,grepl("_NR",colnames(OTU_filtered))])
  
  diffOTUs$pvalue[i] = wilcox.test(R_abundance,NR_abundance, alternative = "two.sided")$p.value
  diffOTUs$fold_change[i] = log2((median(R_abundance) + 0.001)/(median(NR_abundance) + 0.001))
}

## Adjust for multiple testing
#diffOTUs$pvalue = p.adjust(diffOTUs$pvalue,method = "fdr")
# Order according to most significant
diffOTUs = diffOTUs[order(diffOTUs$pvalue),]
# Keep only significant microbes (P < 0.05)
diffOTUs = diffOTUs[diffOTUs$pvalue < 0.05,]

# Lets make a heatmap of the top differentially abundant microbes
hm.data = log10(OTU_filtered[rownames(diffOTUs),]+1e-10)
# Create annotations for heatmap
ha = HeatmapAnnotation(Response = as.factor(gsub(".*_R","R",gsub(".*_NR","NR",colnames(OTU)))),
                       Study = as.factor(substr(colnames(OTU), start = 6, stop = 8)),
                       col = list(Response = c(NR ="violetred", R = "darkturquoise"),
                                  Study = c(Gop = "burlywood1",Fra = "indianred1", Mat = "firebrick4", Rou = "chocolate")))
# Define color palette
col_fun = colorRamp2(c(-10,-5,-2), c("slateblue4","maroon3","lightyellow"))

# Draw heatmap
hm = Heatmap(hm.data,
        name = "log10 rel. ab.",
        row_names_side = "left",
        clustering_distance_rows = "euclidean",
        clustering_distance_columns = "euclidean",
        show_column_names = FALSE,
        width = unit(20, "cm"),
        height = unit(5, "cm"),
        top_annotation = ha,
        col = col_fun,
        column_dend_height = unit(50, "mm"))

# Add boxplots to each row
rg = range(hm.data)
pat_groups = as.logical(as.integer(as.factor(gsub(".*_R","R",gsub(".*_NR","NR",colnames(OTU_filtered))))) - 1L)
anno_multiple_boxplot = function(index) {
    pushViewport(viewport(xscale = rg, yscale = c(0.5, nrow(hm.data) + 0.5)))
    for(i in seq_along(index)) {
        grid.rect(y = nrow(hm.data)-i+1, height = 1, default.units = "native")
        grid.boxplot(hm.data[index[i], pat_groups], pos = nrow(hm.data)-i+1 + 0.2, box_width = 0.3, 
            gp = gpar(fill = "darkturquoise"), direction = "horizontal")
        grid.boxplot(hm.data[ index[i], !pat_groups], pos = nrow(hm.data)-i+1 - 0.2, box_width = 0.3, 
            gp = gpar(fill = "violetred"), direction = "horizontal")
    }
    grid.xaxis()
    popViewport()
}

hm = hm + rowAnnotation(boxplot = anno_multiple_boxplot, width =unit(4, "cm"), show_annotation_name = FALSE)

draw(hm)

Let’s perform the analysis for all studies separately as well

# Routy et al. is omitted due to no significant microbes being detected using our criteria for response
studies = c("Gopalakrishnan","Matson","Frankel")
# Initialize empty plot list
p_list = list()
for (i in 1:length(studies)) {
  local({ #Sets local enviroment in order to not overwrite plot data
    # Extract three letter study ID
    study_id = substr(studies[i], start = 1, stop = 3)
    # Pre-process each individual study
    # Subset the full OTU matrix
    OTU_filtered_study = OTU_filtered[,grepl(study_id,colnames(OTU_filtered))]
    # Sort according to most abundant OTUs
    OTU_filtered_study = OTU_filtered_study[order(rowSums(OTU_filtered_study),decreasing = TRUE),]
    # Initialise empty data frame for storing differentially abundant OTUs
    diffOTUs_study = data.frame(pvalue = 1:nrow(OTU_filtered_study),
                                fold_change = 1:nrow(OTU_filtered_study),
                                taxonomy = rownames(OTU_filtered_study))
    rownames(diffOTUs_study) = rownames(OTU_filtered_study)
    # Loop over all OTUs
    for (j in 1:nrow(OTU_filtered_study)) {
      # Extract abundances of each OTU for both R and NR
      R_abundance = as.numeric(OTU_filtered_study[j,grepl("_R",colnames(OTU_filtered_study))])
      NR_abundance = as.numeric(OTU_filtered_study[j,grepl("_NR",colnames(OTU_filtered_study))])
      # Perform wilcoxon rank sum test between responders and non-responders
      diffOTUs_study$pvalue[j] = wilcox.test(R_abundance,NR_abundance, alternative = "two.sided", exact = FALSE)$p.value
      diffOTUs_study$fold_change[j] = log2((median(R_abundance) + 0.001)/(median(NR_abundance) + 0.001))
    }
    ## Adjust for multiple testing
    #diffOTUs_study$pvalue = p.adjust(diffOTUs_study$pvalue,method = "fdr")
    # Remove NaN values
    diffOTUs_study = diffOTUs_study[!is.na(diffOTUs_study$pvalue),]
    # Order according to most significant
    diffOTUs_study = diffOTUs_study[order(diffOTUs_study$pvalue),]
    # Keep only significant microbes (P < 0.05)
    diffOTUs_study = diffOTUs_study[diffOTUs_study$pvalue < 0.05,]

    # Lets make a heatmap of the top differentially abundant microbes
    hm.data = log10(OTU_filtered_study[rownames(diffOTUs_study),]+1e-10)
    
    # Create annotations for heatmap
    ha = HeatmapAnnotation(Response = as.factor(gsub(".*_R","R",gsub(".*_NR","NR",colnames(OTU_filtered_study)))),
                           col = list(Response = c(NR ="violetred", R = "darkturquoise")),
                           which = "column")
    # Define color palette
    col_fun = colorRamp2(c(-10,-6,-2), c("slateblue4","maroon3","lightyellow"))
    
    hm = Heatmap(hm.data,
                 name = "log10 rel. ab.",
                 row_labels = as.character(gsub(".*s__","",diffOTUs_study$taxonomy)),
                 row_names_side = "left",
                 clustering_distance_rows = "euclidean",
                 clustering_distance_columns = "euclidean",
                 show_column_names = FALSE,
                 width = unit(20, "cm"),
                 height = unit(20, "cm"),
                 top_annotation = ha,
                 col = col_fun,
                 column_dend_height = unit(20, "mm"))
    
    # Add boxplots to each row
    rg = range(hm.data)
    pat_groups = as.logical(as.integer(as.factor(gsub(".*_R","R",gsub(".*_NR","NR",colnames(OTU_filtered_study))))) - 1L)
    anno_multiple_boxplot = function(index) {
        pushViewport(viewport(xscale = rg, yscale = c(0.5, nrow(hm.data) + 0.5)))
        for(i in seq_along(index)) {
            grid.rect(y = nrow(hm.data)-i+1, height = 1, default.units = "native")
            grid.boxplot(hm.data[index[i], pat_groups], pos = nrow(hm.data)-i+1 + 0.2, box_width = 0.3, 
                gp = gpar(fill = "darkturquoise"), direction = "horizontal")
            grid.boxplot(hm.data[ index[i], !pat_groups], pos = nrow(hm.data)-i+1 - 0.2, box_width = 0.3, 
                gp = gpar(fill = "violetred"), direction = "horizontal")
        }
        grid.xaxis()
        popViewport()
    }
    
    hm = hm + rowAnnotation(boxplot = anno_multiple_boxplot, width =unit(4, "cm"), show_annotation_name = FALSE)
    # Save to plot list
    p_list[[i]] <<- hm
  })
}
# Draw heatmap
for (i in 1:length(p_list)) {
  draw(p_list[[i]])
}

Let’s also try using Fisher’s exact test between R and NR

# Initialise empty data frame for storing differentially abundant OTUs
diffOTUs = data.frame(pvalue = 1:nrow(OTU_filtered), taxonomy = phylo_meta[rownames(OTU_filtered),1])
rownames(diffOTUs) = rownames(OTU_filtered)

# Loop over all OTUs
for (i in 1:nrow(OTU_filtered)) {
  # Extract abundances of each OTU for both R and NR
  R_abundance = as.numeric(OTU_filtered[i,grepl("_R",colnames(OTU_filtered))])
  NR_abundance = as.numeric(OTU_filtered[i,grepl("_NR",colnames(OTU_filtered))])
  # Convert to presence/absence
  R_abundance = R_abundance>0
  NR_abundance = NR_abundance>0
  # Create contingency table
  cont.table = table(data.frame(Response = c(rep("R",length(R_abundance)),rep("NR",length(NR_abundance))),
                                Presence = c(R_abundance,NR_abundance)))
  # Perform Fisher's exact test
  diffOTUs$pvalue[i] = fisher.test(cont.table)$p.value
}

## Adjust for multiple testing
#diffOTUs$pvalue = p.adjust(diffOTUs$pvalue,method = "fdr")
# Order according to most significant
diffOTUs = diffOTUs[order(diffOTUs$pvalue),]
# Keep only significant microbes (P < 0.05)
diffOTUs = diffOTUs[diffOTUs$pvalue < 0.06,]

# Lets make a heatmap of the top differentially abundant microbes
hm.data = as.matrix(log10(OTU_filtered[rownames(diffOTUs),]+1e-10))
# Create annotations for heatmap
ha = HeatmapAnnotation(Response = as.factor(gsub(".*_R","R",gsub(".*_NR","NR",colnames(OTU_filtered)))),
                       Study = as.factor(substr(colnames(OTU_filtered), start = 6, stop = 8)),
                       col = list(Response = c(NR ="violetred", R = "darkturquoise"),
                                  Study = c(Gop = "burlywood1",Fra = "indianred1", Mat = "firebrick4", Rou = "chocolate")))
# Define color palette
col_fun = colorRamp2(c(-10,-5,-2), c("slateblue4","maroon3","lightyellow"))

# Draw heatmap
Heatmap(hm.data,
        name = "log10 rel. ab.",
        #row_labels = as.character(gsub(".*s__","",diffOTUs$taxonomy)),
        clustering_distance_rows = "euclidean",
        clustering_distance_columns = "euclidean",
        show_column_names = FALSE,
        width = unit(20, "cm"),
        height = unit(5, "cm"),
        top_annotation = ha,
        col = col_fun,
        column_dend_height = unit(50, "mm"))

Let’s also run DESeq2 on our data

# Convert phyloseq object to DEseq object
ds_full = phyloseq_to_deseq2(physeq, ~ Study + Treatment + Response)
## converting counts to integer mode
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not an warning or error]
# OTU matrix has many zeros. We need to re-estimate the size factors.
# Iterative normalisation does not converge, lets use poscounts instead.
ds_full = estimateSizeFactors(ds_full, type = "poscounts")
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not an warning or error]
# Run DESeq2
ds_full = DESeq(ds_full, test = "Wald", fitType = "parametric")
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not an warning or error]
## final dispersion estimates
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not an warning or error]
## fitting model and testing
## 1 rows did not converge in beta, labelled in mcols(object)$betaConv. Use larger maxit argument with nbinomWaldTest
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not an warning or error]
## -- replacing outliers and refitting for 1237 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not an warning or error]
## fitting model and testing
## 1 rows did not converge in beta, labelled in mcols(object)$betaConv. Use larger maxit argument with nbinomWaldTest
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not an warning or error]
# Extract results
res_full = results(ds_full, cooksCutoff = FALSE)
# Set p-value threshold and filter out significant OTUs
alpha = 0.05
sigTab_full = res_full[which(res_full$padj < alpha),]
sigTab_full = cbind(as(sigTab_full, "data.frame"), as(tax_table(physeq)[rownames(sigTab_full), ], "matrix"))
head(sigTab_full)
# Lets make a heatmap of the top differentially abundant microbes
physeq_norm = transform_sample_counts(physeq, function(x) x/sum(x))
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'RNeXML'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'RNeXML'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'RNeXML'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'RNeXML'
OTU_norm = as.data.frame(otu_table(physeq_norm))

hm.data = log10(OTU_norm[rownames(sigTab_full),]+1e-10)
# Create annotations for heatmap
ha = HeatmapAnnotation(Response = as.factor(gsub(".*_R","R",gsub(".*_NR","NR",colnames(OTU)))),
                       Study = as.factor(substr(colnames(OTU), start = 6, stop = 8)),
                       col = list(Response = c(NR ="violetred", R = "darkturquoise"),
                                  Study = c(Gop = "burlywood1",Fra = "indianred1", Mat = "firebrick4", Rou = "chocolate")))
# Define color palette
col_fun = colorRamp2(c(-10,-5,-2), c("slateblue4","maroon3","lightyellow"))

# Draw heatmap
Heatmap(hm.data,
        name = "log10 rel. ab.",
        row_labels = sigTab_full$Species,
        clustering_distance_rows = "euclidean",
        clustering_distance_columns = "euclidean",
        show_column_names = FALSE,
        width = unit(20, "cm"),
        height = unit(5, "cm"),
        top_annotation = ha,
        col = col_fun,
        column_dend_height = unit(50, "mm"))
## Warning: The input is a data frame, convert it to the matrix.

# Store mOTUs IDs for all the significant OTUs
sigOTUs = rownames(sigTab_full)
# Subset OTU table in phyloseq object
physeq_subset = subset(otu_table(physeq_norm), rownames(otu_table(physeq_norm)) %in%  sigOTUs)
physeq_sig = merge_phyloseq(physeq_subset, tax_table(physeq_norm), sample_data(physeq_norm))
# Format data for ggplot
plotData = as.matrix(otu_table(physeq_sig))
rownames(plotData) = as.character(sigTab_full$Species)
plotData = melt(plotData)
plotData = cbind(plotData, Response = gsub(".*_R","R",gsub(".*_NR","NR",plotData$Var2)))
colnames(plotData) = c("mOTU","Patient","Abundance","Response")

plotData$Abundance = log10(as.numeric(plotData$Abundance) + 1e-8)

p_full = ggplot(data = plotData,aes(x=Response,y=Abundance,color=Response)) +
  facet_wrap(facets = "mOTU",ncol = 4, scales = "free") +
  geom_violin() +
  geom_jitter()
  
p_full

Random forests

Select taxonomic level:

physeq_f_level = taxa_level(physeq_f, which_level = "Species")
OTU_filtered = as.matrix(t(as.data.frame(otu_table(physeq_f_level))))

Pre-process data

# log transform the data
OTU_filtered = log10(OTU_filtered + 1e-8)
# Sort according to most abundant OTUs
OTU_filtered = OTU_filtered[order(rowSums(OTU_filtered),decreasing = TRUE),]

# Split into training and test data
clin_train = clin %>% filter(Study != "Routy_et_al")
OTU_train = as.data.frame(t(OTU_filtered[,colnames(OTU_filtered) %in% clin_train$Patient_id]))
clin_test = clin %>% filter(Study == "Routy_et_al")
OTU_test = as.data.frame(t(OTU_filtered[,colnames(OTU_filtered) %in% clin_test$Patient_id]))

#Concatenate Treatment, Study and Cancer_type information into data frame
OTU_train = cbind(OTU_train,
                  Treatment = clin_train$Treatment,
                  Study = clin_train$Study,
                  Cancer_type = clin_train$Cancer_type)
OTU_test = cbind(OTU_test,
                  Treatment = clin_test$Treatment,
                  Study = clin_test$Study,
                  Cancer_type = clin_test$Cancer_type)
set.seed(8)
# Construct random forest model for training data
rf.OTU <- randomForest(x=OTU_train,y=clin_train$Response, ntree = 10000 ,importance = TRUE)
# View confusion matrix
print(rf.OTU$confusion)
##    NR  R class.error
## NR 32 23   0.4181818
## R  33 15   0.6875000
# View the most important (Mean decrease of GINI impurity) features for prediction
rf.OTU.importance <- data.frame(importance(rf.OTU, type = 2)) %>%
  rownames_to_column('mOTU') %>%
  arrange(desc(MeanDecreaseGini))
print(head(rf.OTU.importance))
##                                           mOTU MeanDecreaseGini
## 1             Intestinimonas butyriciproducens        0.7671345
## 2 [Clostridium] sp. [C boltae/clostridioforme]        0.6042613
## 3                      Bifidobacterium bifidum        0.5048323
## 4                    Bilophila wadsworthia [C]        0.4955426
## 5              Subdoligranulum sp. 4_3_54A2FAA        0.4950698
## 6                      Anaerostipes hadrus [C]        0.4612936
# Let's plot the top predictiors
nPlot = 20
# Select top predictors from OTU matrix and format for ggplot
plotData = as.matrix(OTU_train[,rf.OTU.importance$mOTU[1:nPlot]])
plotData = melt(plotData)
plotData = cbind(plotData, Response = gsub(".*_R","R",gsub(".*_NR","NR",plotData$Var1)))
colnames(plotData) = c("Patient","mOTU","Abundance","Response")
p_RFpredictors = ggplot(data = plotData,aes(x=Response,y=Abundance,color=Response)) +
  facet_wrap(facets = "mOTU",ncol = 4, scales = "free") +
  geom_violin() +
  geom_jitter() +
  theme_classic() +
  theme(plot.title = element_text(face="bold")) +
  theme(text = element_text(family = "Helvetica")) +
  theme(axis.text.x=element_blank()) +
  scale_color_manual(values=c("violetred", "darkturquoise")) +
  ylab("log-normalised relative abundance") +
  xlab("")
p_RFpredictors

Let’s generate a ROC curve of our model

rf.train.roc = roc(as.factor(clin_train$Response),rf.OTU$votes[,2])
## Setting levels: control = NR, case = R
## Setting direction: controls > cases
plot(rf.train.roc)

auc(rf.train.roc)
## Area under the curve: 0.6034
# That's pretty bad, lets try vaying number of trees
n_tree = c(50, 100, 500, 1000, 2000, 10000, 20000)
plotData = data.frame(n_tree, AUC = 1:length(n_tree))
for (i in 1:length(n_tree)) {
  set.seed(8)
  print(paste("number of trees: ",n_tree[i]))
  rf.OTU <- randomForest(x=OTU_train,y=as.factor(clin_train$Response), ntree = n_tree[i] ,importance = TRUE)
  rf.train.roc = roc(as.factor(clin_train$Response),rf.OTU$votes[,2])
  plotData$AUC[i] = auc(rf.train.roc)
}
## [1] "number of trees:  50"
## Setting levels: control = NR, case = R
## Setting direction: controls > cases
## [1] "number of trees:  100"
## Setting levels: control = NR, case = R
## Setting direction: controls > cases
## [1] "number of trees:  500"
## Setting levels: control = NR, case = R
## Setting direction: controls > cases
## [1] "number of trees:  1000"
## Setting levels: control = NR, case = R
## Setting direction: controls > cases
## [1] "number of trees:  2000"
## Setting levels: control = NR, case = R
## Setting direction: controls > cases
## [1] "number of trees:  10000"
## Setting levels: control = NR, case = R
## Setting direction: controls > cases
## [1] "number of trees:  20000"
## Setting levels: control = NR, case = R
## Setting direction: controls > cases
p_RFperf = ggplot(data = plotData, aes(x = n_tree, y = AUC)) +
  geom_line() +
  geom_point() +
  xlab("no. of trees") +
  ylab("AUC") +
  ggtitle("Random forest on OTU matrix") +
  theme(plot.title = element_text(face="bold")) +
  theme(text = element_text(family = "Helvetica"))
p_RFperf

Combined enrichent

In order to enrich for predictive microbes in our Random forest, let’s first enrich for microbes in the melanoma data through DESeq2 and then build a random forest model using only these microbes.

Select phylogenetic level and subset melanoma samples:

physeq_mel = subset_samples(physeq, Cancer_type == "MM")
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'RNeXML'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'RNeXML'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'RNeXML'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'RNeXML'

Run DESeq2

# Convert phyloseq object to DEseq object
ds_full = phyloseq_to_deseq2(physeq_mel, ~ Study + Treatment + Response)
## converting counts to integer mode
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not an warning or error]
# OTU matrix has many zeros. We need to re-estimate the size factors.
# Iterative normalisation does not converge, lets use poscounts instead.
ds_full = estimateSizeFactors(ds_full, type = "poscounts")
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not an warning or error]
# Run DESeq2
ds_full = DESeq(ds_full, test = "Wald", fitType = "parametric")
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not an warning or error]
## final dispersion estimates
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not an warning or error]
## fitting model and testing
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not an warning or error]
## -- replacing outliers and refitting for 986 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not an warning or error]
## fitting model and testing
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not an warning or error]
# Extract results
res_full = results(ds_full, cooksCutoff = FALSE)
# Set p-value threshold and filter out significant OTUs
alpha = 0.05
sigTab_full = res_full[which(res_full$padj < alpha),]
sigTab_full = cbind(as(sigTab_full, "data.frame"), as(tax_table(physeq_mel)[rownames(sigTab_full), ], "matrix"))

Train RF model using these predictors

OTU_filtered = as.matrix(as.data.frame(otu_table(physeq)))
# Convert to relative abundance
OTU = apply(OTU, 2, function(x){x/sum(x)})
# log transform the data
OTU_filtered = log10(OTU_filtered + 1e-8)
# Extract microbes from DESeq2 output
OTU_filtered = OTU_filtered[rownames(sigTab_full),]

# Split into training and test data
clin_train = clin %>% filter(Study != "Routy_et_al")
OTU_train = as.data.frame(t(OTU_filtered[,colnames(OTU_filtered) %in% clin_train$Patient_id]))
clin_test = clin %>% filter(Study == "Routy_et_al")
OTU_test = as.data.frame(t(OTU_filtered[,colnames(OTU_filtered) %in% clin_test$Patient_id]))

#Concatenate Treatment, Study and Cancer_type information into data frame
OTU_train = cbind(OTU_train,
                  Treatment = clin_train$Treatment,
                  Study = clin_train$Study,
                  Cancer_type = clin_train$Cancer_type)
OTU_test = cbind(OTU_test,
                  Treatment = clin_test$Treatment,
                  Study = clin_test$Study,
                  Cancer_type = clin_test$Cancer_type)

set.seed(8)
# Construct random forest model for training data
rf.OTU <- randomForest(x=OTU_train,y=clin_train$Response, ntree = 10000 ,importance = TRUE)
# View confusion matrix
print(rf.OTU$confusion)
##    NR  R class.error
## NR 39 16   0.2909091
## R  23 25   0.4791667
# View the most important (Mean decrease of GINI impurity) features for prediction
rf.OTU.importance <- data.frame(importance(rf.OTU, type = 2)) %>%
  rownames_to_column('mOTU') %>%
  arrange(desc(MeanDecreaseGini))
print(head(rf.OTU.importance))
##                mOTU MeanDecreaseGini
## 1 meta_mOTU_v2_6916         5.766469
## 2  ref_mOTU_v2_0786         4.782548
## 3 meta_mOTU_v2_7059         3.982348
## 4 meta_mOTU_v2_6276         3.592474
## 5 meta_mOTU_v2_6091         3.247095
## 6  ref_mOTU_v2_5296         2.697667
# Let's plot the top predictiors

# Select top predictors from OTU matrix and format for ggplot
plotData = OTU_filtered
plotData = melt(plotData)
plotData = cbind(plotData, Response = gsub(".*_R","R",gsub(".*_NR","NR",plotData$Var2)))
colnames(plotData) = c("mOTU","Patient","Abundance","Response")
p_RFpredictors = ggplot(data = plotData,aes(x=Response,y=Abundance,color=Response)) +
  facet_wrap(facets = "mOTU",ncol = 4, scales = "free") +
  geom_violin() +
  geom_jitter() +
  theme_classic() +
  theme(plot.title = element_text(face="bold")) +
  theme(text = element_text(family = "Helvetica")) +
  theme(axis.text.x=element_blank()) +
  scale_color_manual(values=c("violetred", "darkturquoise")) +
  ylab("log-normalised relative abundance") +
  xlab("")
p_RFpredictors

Let’s generate a ROC curve of our model

rf.train.roc = roc(as.factor(clin_train$Response),rf.OTU$votes[,2])
## Setting levels: control = NR, case = R
## Setting direction: controls < cases
plot(rf.train.roc)

auc(rf.train.roc)
## Area under the curve: 0.633